
# Replication Files

# "Foreign Policy Failures and the Attractiveness of Great Powers"
# by Rachel Myrick and William Marble, BJPolS 

# scrape BBC headlines around time of withdrawal from Afghanistan
library(tidyverse)
library(rvest)
library(polite)
library(lubridate)


# Scrape one BBC archive page and return its article cards as a tibble.
#
# Args:
#   sess: a polite session (as created by bow()) for the page to scrape.
#
# Returns:
#   A tibble with character columns `headline`, `description` and `url`,
#   one row per distinct card found on the page. A field that is absent
#   from a card is NA. The tibble has zero rows when the page contains no
#   article cards.
#
# Raises:
#   An error when scrape() returns no content for the session.
get_bbc_headline <- function(sess) {
  page <- scrape(sess)

  # scrape() returns NULL (with only a warning) when the request fails or is
  # not permitted; fail with a clear message rather than letting
  # html_elements() die on a NULL input further down
  if (is.null(page)) {
    stop(
      "scrape() returned no content: the request failed or was not permitted.",
      call. = FALSE
    )
  }

  cards <- html_elements(page, "[data-testid='liverpool-article']")

  # html_element() returns exactly one result per card (a missing node when
  # nothing matches), and html_text2() turns a missing node into NA, so the
  # extracted vector always lines up with `cards`
  card_text <- function(css) {
    cards |>
      html_element(css) |>
      html_text2()
  }

  tibble(
    headline    = card_text("[data-testid='card-headline']"),
    description = card_text("[data-testid='card-description']"),

    # nearest wrapping <a> -- link to story, resolved against the BBC domain
    url = cards |>
      html_element(xpath = "./ancestor::a[1]") |>
      html_attr("href") |>
      url_absolute("https://www.bbc.com")
  ) |>
    distinct()
}


# Resume support: read the checkpoint of dates that were already scraped,
# or start from an empty checkpoint when none has been written yet
completed_file <- "Data/bbc-headlines/completed-dates.csv"
completed <- if (file.exists(completed_file)) {
  read.csv(completed_file)
} else {
  data.frame(date = character(0))
}

# One row per day in the scraping window, with the zero-padded month and day
# strings and the archive URL built from them
path_base <- "http://www.bbc.com/pages/archive/2021/%s/%s"
dates <- tibble(
  date = seq(ymd("2021-06-16"), ymd("2021-10-14"), by = "days")
) |>
  mutate(
    month = format(date, "%m"),
    day = format(date, "%d"),
    url = sprintf(path_base, month, day)
  )

# Scrape every day in `dates` that is not yet recorded in `completed`.
#
# For each day the archive is paged through (?page=1, 2, ...) until a page
# yields no articles. The day's rows are written to
# Data/bbc-headlines/<date>.csv and only then is the date appended to the
# checkpoint file, so an error or interruption part-way through a day leaves
# that day unrecorded and it is scraped again on the next run.

# not populated by this loop; kept so the name remains defined
all_res <- list()

# make sure the output directory exists before any scraping is done, so a
# fresh checkout does not fail at the first write
out_dir <- "Data/bbc-headlines"
dir.create(out_dir, recursive = TRUE, showWarnings = FALSE)

# parse the checkpoint once: each date in `dates` is visited a single time,
# so dates finished during this run never need to be looked up again
already_done <- ymd(completed$date)

for (d in seq_along(dates$date)) {

  current_date <- dates$date[d]
  if (current_date %in% already_done) next
  rlang::inform(
    c("i" = paste0("Scraping articles from ", as.character(current_date), "\n"))
  )

  page <- 1
  date_res <- list()
  repeat {
    sess <- bow(paste0(dates$url[d], "?page=", page))
    content <- get_bbc_headline(sess)
    # the final, empty page is stored too, so a day without any articles
    # still produces a file with the expected columns
    date_res[[page]] <- content
    Sys.sleep(1)
    if (nrow(content) == 0) break
    page <- page + 1
  }

  date_res <- bind_rows(date_res) %>% mutate(date = current_date)
  write.csv(
    date_res,
    file.path(out_dir, paste0(as.character(current_date), ".csv")),
    row.names = FALSE
  )

  # checkpoint after the day's file is safely on disk
  completed <- rbind(
    completed,
    data.frame(date = as.character(current_date))
  )
  write.csv(
    completed,
    file.path(out_dir, "completed-dates.csv"),
    row.names = FALSE
  )
}
